/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.tools;

import java.io.*;
import java.net.*;
import java.util.*;
import java.text.*;
import java.util.logging.*;

import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;

/**********************************************
 * This class takes an IWebDBReader, computes a relevant subset,
 * and then emits the subset.
 *
 * @author Mike Cafarella
 ***********************************************/
public class FetchListTool {
    public static final Logger LOG = LogFormatter.getLogger("net.nutch.tools.FetchListTool");

    private static String TOP_N_SORTER = "topNSorter";
    private static final long FETCH_GENERATION_DELAY_MS = 7 * 24 * 60 * 60 * 1000;

    File dbDir;
    boolean refetchOnly, anchorOptimize;
    float cutoffScore;
    int seed;

    /**
     * The TableSet class will allocate a given FetchListEntry
     * into one of several ArrayFiles. It chooses which ArrayFile
     * to use based on a hash of the URL's host name.
     *
     * It uses a hash of the host name so that pages are allocated
     * to an effectively random ArrayFile, but same-host pages go
     * to the same file (for efficiency purposes during fetch).
     *
     * Further, within a given file, the FetchListEntry items
     * appear in random order. This is so that we don't
     * hammer the same site over and over again during fetch.
     *
     * Each table should receive a roughly even number of entries,
     * but all URLs for a specific host will be found in a single
     * table. If the dataset is heavily skewed toward a few large
     * hosts, the distribution may be uneven.
     */
    class TableSet {
        Vector outputPaths = new Vector();
        Vector tables = new Vector();
        long appendCounts[];
        boolean hasAppended = false;

        /**
         */
        public TableSet() {
        }

        /**
         * Add a table to the list. Cannot be called
         * after we start appending entries.
         */
        public synchronized boolean add(String outputPath) throws IOException {
            if (hasAppended) {
                return false;
            }

            //
            // Record where the file should go. Then open a
            // SequenceFile.Writer to record the set of items
            // we append to each table.
            //
            outputPaths.add(outputPath);
            tables.add(new SequenceFile.Writer(outputPath + ".unsorted", MD5Hash.class, FetchListEntry.class));
            return true;
        }

        /**
         * Add FetchListEntry items to one of the tables.
         * Choose the table based on a hash of the URL's host name.
         */
        public synchronized boolean append(FetchListEntry newEntry) throws IOException {
            hasAppended = true;
            if (appendCounts == null) {
                appendCounts = new long[outputPaths.size()];
            }

            Page fetchPage = newEntry.getPage();

            // Extract the hostname from the URL
            String host = null;
            try {
                host = new URL(fetchPage.getURL().toString()).getHost().toLowerCase();
            } catch (MalformedURLException e) {
                // ignore bad URLs
                return false;
            }

            // Figure out which table is getting the item
            MD5Hash hash = MD5Hash.digest(host);
            int index = Math.abs(hash.hashCode() ^ seed) % tables.size();

            // Write it down and return
            SequenceFile.Writer writer = (SequenceFile.Writer) tables.elementAt(index);
            writer.append(fetchPage.getMD5(), newEntry);
            appendCounts[index]++;
            return true;
        }

        /**
         * Close down the TableSet, so there are no more FetchListEntries
         * expected. We now:
         *   a) Close down all the SequenceFile.Writer objects.
         *   b) Sort each file
         *   c) Read each newly-sorted file and copy to an ArrayFile
         */
        public synchronized void close() throws IOException {
            hasAppended = true;

            // A) Close all the SequenceFile.Writers
            for (Enumeration e = tables.elements(); e.hasMoreElements(); ) {
                ((SequenceFile.Writer) e.nextElement()).close();
            }

            // B) Sort the edit-files
            SequenceFile.Sorter sorter = new SequenceFile.Sorter(new MD5Hash.Comparator(), FetchListEntry.class);

            //
            // Iterate through each unsorted file. Sort it (while
            // measuring the time taken) and upon completion delete
            // the unsorted version.
            //
            long totalEntries = 0;
            double totalTime = 0;
            int i = 0;
            for (Enumeration e = outputPaths.elements(); e.hasMoreElements(); i++) {
                String name = (String) e.nextElement();
                String unsortedName = name + ".unsorted";

                long localStart = System.currentTimeMillis();
                sorter.sort(unsortedName, name + ".sorted");
                long localEnd = System.currentTimeMillis();

                if (appendCounts != null) {
                    double localSecs = ((localEnd - localStart) / 1000.0);
                    LOG.info("Processing " + unsortedName + ": Sorted " + appendCounts[i] + " entries in " + localSecs + " seconds.");
                    LOG.info("Processing " + unsortedName + ": Sorted " + (appendCounts[i] / localSecs) + " entries/second");
                    totalEntries += appendCounts[i];
                    totalTime += localSecs;
                }
                new File(name + ".unsorted").delete();
            }
            LOG.info("Overall processing: Sorted " + totalEntries + " entries in " + totalTime + " seconds.");
            LOG.info("Overall processing: Sorted " + (totalEntries / totalTime) + " entries/second");

            // C) Read in each newly-sorted file. Copy to an ArrayFile.
            for (Enumeration e = outputPaths.elements(); e.hasMoreElements(); ) {
                String name = (String) e.nextElement();
                SequenceFile.Reader reader = new SequenceFile.Reader(name + ".sorted");
                ArrayFile.Writer af = new ArrayFile.Writer(name, FetchListEntry.class);
                try {
                    MD5Hash key = new MD5Hash();
                    FetchListEntry fle = new FetchListEntry();
                    while (reader.next(key, fle)) {
                        af.append(fle);
                    }
                } finally {
                    af.close();
                    reader.close();
                    new File(name + ".sorted").delete();
                }
            }
        }
    }

    /*************************************
     * SortableScore is just a WritableComparable Float!
     *************************************/
    public static class SortableScore implements WritableComparable {
        float score;

        /**
         */
        public SortableScore() {
        }

        /**
         */
        public void set(float score) {
            this.score = score;
        }

        /**
         */
        public float getFloat() {
            return score;
        }

        ////////
        // WritableComparable
        ////////

        /**
         * Sort them in descending order!
         */
        public int compareTo(Object o) {
            SortableScore otherScore = (SortableScore) o;
            if (score < otherScore.score) {
                return 1;
            } else if (score == otherScore.score) {
                return 0;
            } else {
                return -1;
            }
        }

        /**
         */
        public void write(DataOutput out) throws IOException {
            out.writeFloat(score);
        }

        /**
         */
        public void readFields(DataInput in) throws IOException {
            this.score = in.readFloat();
        }
    }

    /**
     * FetchListTool takes a page db and emits a fetchlist subset of it.
     */
    public FetchListTool(File dbDir, boolean refetchOnly, boolean anchorOptimize, float cutoffScore, int seed) throws IOException, FileNotFoundException {
        this.dbDir = dbDir;
        this.refetchOnly = refetchOnly;
        this.anchorOptimize = anchorOptimize;
        this.cutoffScore = cutoffScore;
        this.seed = seed;
    }

    /**
     * Spit out several fetchlists, so that we can fetch across
     * several machines.
     */
    public void emitMultipleLists(File dir, int numLists, long topN, long curTime) throws IOException {
        //
        // Create tables (and directories) for each fetchlist we want.
        // Add them all to a TableSet object.
        //
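        // An illustrative sketch of the layout this method produces for,
        // say, numLists == 3 (the literal value of FetchListEntry.DIR_NAME
        // is defined elsewhere; each per-fetcher ArrayFile is written under
        // that name):
        //
        //   <dir>/<yyyyMMddHHmmss>-0/<FetchListEntry.DIR_NAME>
        //   <dir>/<yyyyMMddHHmmss>-1/<FetchListEntry.DIR_NAME>
        //   <dir>/<yyyyMMddHHmmss>-2/<FetchListEntry.DIR_NAME>
        //
        // plus a temporary <dir>/tmp_<yyyyMMddHHmmss> directory that is
        // deleted once the fetchlists have been emitted.
        //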
        TableSet tables = new TableSet();
        try {
            String datePrefix = getDate();
            File workingDir = new File(dir, "tmp_" + getDate());
            workingDir.mkdirs();
            try {
                for (int i = 0; i < numLists; i++) {
                    File subdir = new File(dir, datePrefix + "-" + i);
                    subdir.mkdir();
                    File file = new File(subdir, FetchListEntry.DIR_NAME);
                    tables.add(file.getPath());
                }

                // Now generate the fetchlist.
                emitFetchList(tables, workingDir, topN, curTime);
            } finally {
                FileUtil.fullyDelete(workingDir);
            }
        } finally {
            tables.close();
        }
    }

    /**
     * Spit out a single fetchlist, under a dated subdirectory of
     * the given segment directory.
     */
    public void emitFetchList(File segmentDir, long topN, long curTime) throws IOException {
        TableSet tables = new TableSet();
        File workingDir = new File(segmentDir, "tmp_" + getDate());
        workingDir.mkdirs();
        File subdir = new File(segmentDir, getDate());
        subdir.mkdir();
        try {
            tables.add(new File(subdir, FetchListEntry.DIR_NAME).getPath());
            try {
                emitFetchList(tables, workingDir, topN, curTime);
            } finally {
                tables.close();
            }
        } finally {
            FileUtil.fullyDelete(workingDir);
        }
    }

    private static String getDate() {
        return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System.currentTimeMillis()));
    }

    /**
     * Emit the fetchlist using the given TableSet. The TableSet is
     * responsible for actually appending each item to the output file;
     * this method decides which entries it receives.
     */
    void emitFetchList(TableSet tables, File workingDir, long topN, long curTime) throws IOException {
        //
        // Iterate through all the Pages, by URL. Iterating by URL
        // means we can save disk seeks when calling webdb.getLinks(URL).
        //
        // However, we don't really want the output to be in URL-ordered
        // format. We would like the output to be URL-randomized, which
        // an MD5-ordering provides nicely. But we assume here that
        // TableSet will do that randomizing for us. We just need to
        // make sure we give it a good sampling of our data. (That is,
        // if we are giving TableSet fewer than the max-possible items,
        // we should make sure the items come evenly from all over the
        // db.)
        //
        long count = 0;
        TreeMap anchorTable = new TreeMap();
        Vector unknownDomainLinks = new Vector();

        //
        // Create a comparator that matches the domainIDs for
        // Link objects.
        //
        Comparator domainComparator = new Comparator() {
            public int compare(Object o1, Object o2) {
                Link l1 = (Link) o1;
                Link l2 = (Link) o2;
                if (l1.getDomainID() < l2.getDomainID()) {
                    return -1;
                } else if (l1.getDomainID() == l2.getDomainID()) {
                    return 0;
                } else {
                    return 1;
                }
            }
        };

        //
        // Go through all the pages by URL. Filter the ones
        // we really don't want, and save the others for possible
        // emission.
        //
        SortableScore curScore = new SortableScore();
        File unsortedFile = new File(workingDir, TOP_N_SORTER + ".unsorted");
        SequenceFile.Writer writer = new SequenceFile.Writer(unsortedFile.getPath(), SortableScore.class, FetchListEntry.class);
        try {
            IWebDBReader webdb = new WebDBReader(dbDir);
            try {
                for (Enumeration e = webdb.pages(); e.hasMoreElements(); count++) {
                    // Grab the next Page.
                    Page page = (Page) e.nextElement();
                    boolean shouldFetch = true;

                    if (((count % 50000) == 0) && (count != 0)) {
                        LOG.info("Processing page " + count + "...");
                    }

                    //
                    // Don't emit it if the Page's score doesn't meet
                    // our cutoff value.
                    //
                    if ((cutoffScore >= 0) && (page.getScore() < cutoffScore)) {
                        continue;
                    }

                    //
                    // If the item is not yet ready to be fetched, move on.
                    //
                    // Also, if getNextFetchTime is set to Long.MAX_VALUE,
                    // then it should never be fetched.
                    //
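                    // Note that curTime is not necessarily "now": main() may
                    // have shifted it forward with the -adddays option, which
                    // pulls in pages that would otherwise only become due
                    // within the next N days.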
                    if (page.getNextFetchTime() > curTime || page.getNextFetchTime() == Long.MAX_VALUE) {
                        continue;
                    }

                    //
                    // If we're in refetchOnly mode, set shouldFetch to FALSE
                    // for any Pages whose URL's MD5 is the same as the
                    // listed MD5. That indicates that no content has been
                    // downloaded in the past.
                    //
                    if (refetchOnly) {
                        MD5Hash urlHash = MD5Hash.digest(page.getURL());
                        if (page.getMD5().equals(urlHash)) {
                            shouldFetch = false;
                        }
                    }

                    //
                    // If anchorOptimize mode is on, AND shouldFetch is
                    // false, then we might apply a further optimization.
                    // Since a non-fetched Page (that is, a URL-only
                    // item) can only be discovered via the incoming
                    // anchor text, we can skip those Pages that have
                    // only *empty* incoming anchor text.
                    //
                    Link inlinks[] = webdb.getLinks(page.getURL());
                    if ((! shouldFetch) && anchorOptimize) {
                        boolean foundUsefulAnchor = false;
                        for (int i = 0; i < inlinks.length; i++) {
                            UTF8 anchorText = inlinks[i].getAnchorText();
                            if ((anchorText != null) && (anchorText.toString().trim().length() > 0)) {
                                foundUsefulAnchor = true;
                                break;
                            }
                        }
                        if (! foundUsefulAnchor) {
                            continue;
                        }
                    }

                    //
                    // Uniquify identical anchor text strings by source
                    // domain. If the anchor text is identical, and
                    // the domains are identical, then the anchor should
                    // only be included once.
                    //
                    // Links will arrive in the array sorted first by URL,
                    // and then by source-MD5.
                    //
                    int uniqueAnchors = 0;
                    for (int i = 0; i < inlinks.length; i++) {
                        String anchor = inlinks[i].getAnchorText().toString().trim();
                        if (anchor.length() > 0) {
                            if (inlinks[i].getDomainID() == 0) {
                                unknownDomainLinks.add(anchor);
                            } else {
                                Set domainUniqueLinks = (Set) anchorTable.get(anchor);
                                if (domainUniqueLinks == null) {
                                    domainUniqueLinks = new TreeSet(domainComparator);
                                    anchorTable.put(anchor, domainUniqueLinks);
                                }
                                if (domainUniqueLinks.add(inlinks[i])) {
                                    uniqueAnchors++;
                                }
                            }
                        }
                    }

                    //
                    // Finally, collect the incoming anchor text for
                    // the current URL. Step one is to add the incoming
                    // anchors whose links' source-domains are unknown.
                    // (The target, obviously, is the URL we're currently
                    // processing.)
                    //
                    int i = 0;
                    String results[] = new String[uniqueAnchors + unknownDomainLinks.size()];
                    for (Enumeration e2 = unknownDomainLinks.elements(); e2.hasMoreElements(); i++) {
                        results[i] = (String) e2.nextElement();
                    }
                    unknownDomainLinks.clear();

                    //
                    // Step 2, add the anchors that have actually been
                    // uniquified by source-domain.
                    //
                    for (Iterator it = anchorTable.keySet().iterator(); it.hasNext(); ) {
                        String key = (String) it.next();
                        Set domainUniqueLinks = (Set) anchorTable.get(key);
                        for (int j = 0; j < domainUniqueLinks.size(); j++) {
                            results[i++] = key;
                        }
                    }
                    anchorTable.clear();

                    //
                    // Last, add the FetchListEntry to a file so we can
                    // sort by score. Be sure to modify the Page's
                    // fetchtime; this allows us to soon generate another
                    // fetchlist that will not include this Page. That's
                    // helpful because with two distinct fetchlists, it
                    // should be possible to fetch and perform dbupdate
                    // at the same time.
                    //
                    curScore.set(page.getScore());
                    page.setNextFetchTime(page.getNextFetchTime() + FETCH_GENERATION_DELAY_MS);
                    writer.append(curScore, new FetchListEntry(shouldFetch, page, results));
                }
            } finally {
                webdb.close();
            }
        } finally {
            writer.close();
        }

        //
        // The next step is to sort the file we created above.
        // After being sorted, we add the "topN" items to the
        // TableSet.
        //
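        // SequenceFile.Sorter orders records by their key, and
        // SortableScore.compareTo() deliberately inverts the comparison, so
        // the sorted file comes out highest-score-first. Reading just the
        // first topN records below therefore selects the best-scoring pages.
        //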
        File sortedFile = new File(workingDir, TOP_N_SORTER + ".sorted");
        SequenceFile.Sorter topNSorter = new SequenceFile.Sorter(SortableScore.class, FetchListEntry.class);
        topNSorter.sort(unsortedFile.getPath(), sortedFile.getPath());

        //
        // Last of all, add the topN items to the table set.
        //
        // This is also where we rewrite the WebDB - we need to do
        // this so we can modify the "date" field. Rewriting the
        // db can be expensive, but it's that modification that will
        // allow us to interleave fetching and db-update.
        //
        WebDBWriter dbwriter = new WebDBWriter(dbDir);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(sortedFile.getPath());
            try {
                SortableScore key = new SortableScore();
                FetchListEntry value = new FetchListEntry();
                while (topN > 0 && reader.next(key, value)) {
                    tables.append(value);
                    topN--;

                    //
                    // Modify the Page in the webdb so that its date
                    // is set forward a week. This way, we can generate
                    // two consecutive, different fetchlists without an
                    // intervening update. So, we generate lists A and B,
                    // and start fetching A. Upon completion, we use A to
                    // update the db, and start fetching B. This way we
                    // have simultaneous dbupdate and page fetch, which
                    // should double our throughput.
                    //
                    dbwriter.addPage(value.getPage());
                }
            } finally {
                reader.close();
            }
        } finally {
            dbwriter.close();
        }
    }

    /**
     * Generate a fetchlist from the pagedb and linkdb.
     */
    public static void main(String argv[]) throws IOException, FileNotFoundException {
        if (argv.length < 2) {
            System.out.println("Usage: FetchListTool <db_dir> <segment_dir> [-refetchonly] [-anchoroptimize] [-topN N] [-cutoff cutoffscore] [-numFetchers numFetchers] [-adddays numDays]");
            return;
        }

        //
        // Required args
        //
        File dbDir = new File(argv[0]);
        File segmentDir = new File(argv[1]);
        long curTime = System.currentTimeMillis();

        //
        // Optional args
        //
        boolean refetchOnly = false, anchorOptimize = false;
        long topN = Long.MAX_VALUE;
        float cutoffScore = -1.0f;
        int numFetchers = 1;
        int seed = new Random().nextInt();

        try {
            for (int i = 2; i < argv.length; i++) {
                if ("-refetchonly".equals(argv[i])) {
                    refetchOnly = true;
                } else if ("-anchoroptimize".equals(argv[i])) {
                    anchorOptimize = true;
                } else if ("-topN".equals(argv[i])) {
                    if (i + 1 < argv.length) {
                        topN = Long.parseLong(argv[i + 1]);
                        i++;
                    } else {
                        System.out.println("No argument present for -topN");
                        return;
                    }
                } else if ("-cutoff".equals(argv[i])) {
                    if (i + 1 < argv.length) {
                        cutoffScore = Float.parseFloat(argv[i + 1]);
                        i++;
                    } else {
                        System.out.println("No argument present for -cutoff");
                        return;
                    }
                } else if ("-numFetchers".equals(argv[i])) {
                    if (i + 1 < argv.length) {
                        numFetchers = Integer.parseInt(argv[i + 1]);
                        i++;
                    } else {
                        System.out.println("No argument present for -numFetchers");
                        return;
                    }
                } else if ("-adddays".equals(argv[i])) {
                    if (i + 1 < argv.length) {
                        long numDays = Integer.parseInt(argv[i + 1]);
                        curTime += numDays * 1000L * 60 * 60 * 24;
                        i++;
                    } else {
                        System.out.println("No argument present for -adddays");
                        return;
                    }
                }
            }
        } catch (NumberFormatException nfe) {
            System.out.println("Badly-formatted number: " + nfe);
            return;
        }

        //
        // Check that args are consistent
        //
        if (anchorOptimize && !refetchOnly) {
            System.out.println("Tool cannot use -anchoroptimize option without -refetchonly option as well.");
            return;
        }

        //
        // Finally, start things up.
        //
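        // An illustrative invocation (paths and values here are made up,
        // not taken from the original tool):
        //
        //   java net.nutch.tools.FetchListTool ./db ./segments -topN 1000000 -numFetchers 4
        //
        // which would emit four fetchlists under ./segments, one per fetcher.
        //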
        LOG.info("FetchListTool started");
        if (topN != Long.MAX_VALUE) {
            LOG.info("topN:" + topN);
        }
        if (cutoffScore >= 0) {
            LOG.info("cutoffscore:" + cutoffScore);
        }
        if (numFetchers > 1) {
            LOG.info("seed:" + seed);
        }

        FetchListTool flt = new FetchListTool(dbDir, refetchOnly, anchorOptimize, cutoffScore, seed);
        if (numFetchers > 1) {
            flt.emitMultipleLists(segmentDir, numFetchers, topN, curTime);
        } else {
            flt.emitFetchList(segmentDir, topN, curTime);
        }
        LOG.info("FetchListTool completed");
    }
}
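// A minimal sketch of how a consumer might read back one of the emitted
// fetchlists. ArrayFile.Reader and its next() signature are assumed to
// mirror the ArrayFile.Writer used in TableSet.close(); check net.nutch.io
// before relying on this.
//
//   ArrayFile.Reader fetchlist = new ArrayFile.Reader(pathToFetchList);
//   try {
//       FetchListEntry fle = new FetchListEntry();
//       while (fetchlist.next(fle) != null) {
//           // hand fle off to a fetcher here
//       }
//   } finally {
//       fetchlist.close();
//   }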